import os
NOVA_HOME = "/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA"
NOVA_DATA_HOME = '/home/labs/hornsteinlab/Collaboration/FUNOVA'
LOGS_PATH = os.path.join(NOVA_DATA_HOME, "outputs/preprocessing/logs/")
PLOT_PATH = os.path.join(NOVA_DATA_HOME, "outputs/logs/")
os.chdir(NOVA_HOME)
import pandas as pd
import contextlib
import io
from IPython.display import display, Javascript
import seaborn as sns
from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
plot_cell_count, plot_catplot, plot_hm_combine_batches, plot_hm, \
run_calc_hist_new, plot_marker_data
from tools.preprocessing_tools.qc_reports.qc_config import (
funova_markers as markers,
funova_cell_lines as cell_lines,
funova_cell_lines_to_cond as cell_lines_to_cond,
funova_cell_lines_for_disp as cell_lines_for_disp,
funova_reps as reps,
funova_line_colors as line_colors,
funova_lines_order as lines_order,
funova_custom_palette as custom_palette,
funova_expected_dapi_raw as expected_dapi_raw,
funova_panels as panels,
funova_marker_info as marker_info
)
%load_ext autoreload
%autoreload 2
# choose batches
batches = ['Batch1', 'Batch2']#, 'batch2', 'batch3']
batches
['Batch1', 'Batch2']
I have created a folder called 'Batch1' in the logs dir and put all files inside
validate_antibody = False
df = log_files_qc(LOGS_PATH, batches, only_wt_cond = False)
reading logs of Batch1 Total of 1 files were read. Before dup handeling (182036, 21) After duplication removal #1: (182036, 22) After duplication removal #2: (182036, 22)
df['filename'] = df['filename'].str.split('-').str[0]
df['site_num'] = df['site_num'].str.split('-').str[0]
df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw')
# import os
# import pandas as pd
# # Define the root path
# root_path = "/home/labs/hornsteinlab/Collaboration/FUNOVA/input/images/raw/"
# # Initialize a dictionary to store the data
# data = {}
# # Walk through the directory structure
# for dirpath, dirnames, filenames in os.walk(root_path):
# # Check if the current path contains 'repX' and a subfolder for the marker
# if os.path.basename(os.path.dirname(dirpath)).startswith("rep"): # Check for 'repX' in parent folder
# marker = os.path.basename(dirpath) # The marker is the current folder
# panel_folder = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(dirpath)))) # Three levels up for the panel
# if panel_folder.lower().startswith("panel"): # Ensure it's a panel folder
# panel = panel_folder.replace("panel", "").strip()
# # Add marker and panel information
# if marker not in data:
# data[marker] = {"Antibody": [], "panel": []} # Initialize marker entry
# data[marker]["panel"].append(panel) # Add panel to the marker
# # Convert the data to a DataFrame
# final_marker_info = pd.DataFrame.from_dict(data, orient="index")
# # Ensure 'panel' and 'Antibody' are arrays and unique panels
# final_marker_info["panel"] = final_marker_info["panel"].apply(lambda x: sorted(set(x))) # Remove duplicates and sort
# final_marker_info["Antibody"] = [[] for _ in range(len(final_marker_info))] # Ensure Antibody is an empty array
batches_raw = [batch.replace("_16bit_no_downsample","") for batch in batches]
raws = run_validate_folder_structure(root_directory_raw, False, panels, markers.copy(), PLOT_PATH, marker_info,
cell_lines_to_cond, reps, cell_lines_for_disp, expected_dapi_raw,
batches=batches_raw, fig_width=8,fig_height = 40,
expected_count=100, validate_antibody = validate_antibody)
Batch1 Folder structure is valid. No bad files are found. Total Sites: 118400
======== Batch2 Folder structure is valid. No bad files are found. Total Sites: 118400
======== ====================
# ## Missing data issue was fixed
# differences = (raws[0] != raws[1]).stack()
# differences = differences[differences].index.to_frame(index=False)
# differences.columns = ["Marker", "Rep", "Condition"]
# for condition in differences["Condition"].unique():
# print(f"Condition: {condition}")
# condition_data = differences[differences["Condition"] == condition]
# for rep in condition_data["Rep"].unique():
# markers = condition_data[condition_data["Rep"] == rep]["Marker"].tolist()
# print(f" Rep: {rep}")
# print(f" Markers: {', '.join(markers)}")
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed')
procs = run_validate_folder_structure(root_directory_proc, True, panels, markers, PLOT_PATH, marker_info,
cell_lines_to_cond, reps, cell_lines_for_disp, expected_dapi_raw,
batches=batches, fig_width=8,fig_height = 40,
expected_count=100, validate_antibody = validate_antibody)
Batch1 Folder structure is valid. No bad files are found. Total Sites: 91000
======== Batch2 Folder structure is invalid. Missing 1 paths: /home/labs/hornsteinlab/Collaboration/FUNOVA/input/images/processed/Batch2/C9orf72-HRE/stress/Neuronal_activity No bad files are found. Total Sites: 77986
======== ====================
display_diff(batches, raws, procs, PLOT_PATH, fig_width=8, fig_height = 40)
Batch1
======== Batch2
========
for batch in batches:
with contextlib.redirect_stdout(io.StringIO()):
var = sample_and_calc_variance(root_directory_proc, batch,
sample_size_per_markers=200, cond_count=1, rep_count=len(reps),
num_markers=len(markers))
print(f'{batch} var: ',var)
Batch1 var: 0.01778099779891865 Batch2 var: 0.018942129863741438
By order of filtering
Percentage out of the total sites
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,batches, line_colors, panels,
figsize=(6,18), reps=reps, vmax=100)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if Cellpose found 0 cells in it.
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, batches, dapi_filter_by_brenner,
line_colors, panels, figsize=(6,18), reps=reps)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if after tiling, no tile is containing at least 85% of a cell that Cellpose detected.
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, batches, dapi_filter_by_cellpose,
line_colors, panels, figsize=(6,18), reps=reps)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).
show_site_survival_target_brenner(df_dapi, df_target, dapi_filter_by_tiling,
figsize=(6,24), markers=markers)
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, markers)
plot_marker_data(total_sum, split_by_cell_line=True)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
plot_marker_data(total_sum, split_by_cell_line=False)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(
total_sum.n_valid_tiles.sum()
955381
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum()
411684.0
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum()
1106239.0
show_total_sum_tables(total_sum)
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| Batch1 | ||||
| count | 587.000000 | 587.000000 | 587.000000 | 5.870000e+02 |
| mean | 958.589438 | 9.585894 | 1165.614991 | 3.118043e+03 |
| std | 979.237781 | 9.792378 | 1225.856911 | 3.556683e+03 |
| min | 1.000000 | 0.010000 | 1.000000 | 1.000000e+00 |
| 25% | 268.500000 | 2.685000 | 309.000000 | 7.505000e+02 |
| 50% | 525.000000 | 5.250000 | 619.000000 | 1.492000e+03 |
| 75% | 1111.000000 | 11.110000 | 1303.500000 | 2.809000e+03 |
| max | 3736.000000 | 37.360000 | 4768.000000 | 1.391200e+04 |
| sum | 562692.000000 | NaN | 684216.000000 | 1.830291e+06 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 4.500000e+02 |
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| Batch2 | ||||
| count | 587.000000 | 587.000000 | 587.000000 | 5.870000e+02 |
| mean | 668.976150 | 6.689761 | 824.144804 | 2.182400e+03 |
| std | 746.778862 | 7.467789 | 948.683856 | 2.795358e+03 |
| min | 1.000000 | 0.010000 | 1.000000 | 1.000000e+00 |
| 25% | 178.500000 | 1.785000 | 207.000000 | 4.530000e+02 |
| 50% | 337.000000 | 3.370000 | 408.000000 | 9.070000e+02 |
| 75% | 824.000000 | 8.240000 | 915.500000 | 2.273000e+03 |
| max | 3645.000000 | 36.450000 | 4537.000000 | 1.366000e+04 |
| sum | 392689.000000 | NaN | 483773.000000 | 1.281069e+06 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 4.500000e+02 |
| n valid tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| All batches | ||||
| count | 1174.000000 | 1174.000000 | 1.174000e+03 | 1.174000e+03 |
| mean | 813.782794 | 8.137828 | 9.948799e+02 | 2.650221e+03 |
| std | 882.401856 | 8.824019 | 1.108835e+03 | 3.231459e+03 |
| min | 1.000000 | 0.010000 | 1.000000e+00 | 1.000000e+00 |
| 25% | 225.000000 | 2.250000 | 2.572500e+02 | 5.775000e+02 |
| 50% | 430.500000 | 4.305000 | 5.045000e+02 | 1.166000e+03 |
| 75% | 1004.500000 | 10.045000 | 1.172500e+03 | 2.724750e+03 |
| max | 3736.000000 | 37.360000 | 4.768000e+03 | 1.391200e+04 |
| sum | 955381.000000 | NaN | 1.167989e+06 | 3.111360e+06 |
| expected_count | 450.000000 | 450.000000 | 4.500000e+02 | 4.500000e+02 |
For each batch, cell line, replicate and markerTotal number of tiles
to_heatmap = total_sum.rename(columns={'n_valid_tiles':'index'})
plot_filtering_heatmap(to_heatmap, extra_index='marker', vmin=None, vmax=None,
xlabel = 'Total number of tiles', show_sum=True, figsize=(6,24))
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=10) /home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=10) /home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)
For each batch, cell line, replicate and markerTotal number of tiles
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum':'index'})
plot_filtering_heatmap(to_heatmap, extra_index='marker', vmin=None, vmax=None,
xlabel = 'Total number of whole cells', show_sum=True, figsize=(6,24))
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=10) /home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=10) /home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, lines_order, custom_palette, y='site_cell_count_sum',
title='Cell Count Average per Site (from tiles)', figsize=(16,6))
plot_cell_count(df_no_empty_sites, lines_order, custom_palette, y='site_whole_cells_counts_sum',
title='Whole Cell Count Average per Site',figsize=(16,6))
plot_cell_count(df_no_empty_sites, lines_order, custom_palette, y='site_cell_count',
title='Cellpose Cell Count Average per Site',figsize=(16,6))
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
cell_line_cond C9orf72-HRE Untreated 3.509666 C9orf72-HRE stress 3.177155 Control Untreated 6.673802 Control stress 5.215105 TDP--43-G348V Untreated 2.422217 TDP--43-G348V stress 1.821556 TDP--43-N390D Untreated 4.763732 TDP--43-N390D stress 3.445943 Name: n_valid_tiles, dtype: float64
plot_catplot(df_dapi, sns.color_palette('colorblind'), reps=reps,x='cell_line', y_title='Valid Tiles Count', x_title='Cell Line', y='n_valid_tiles', hue='rep',
height=4, aspect=2)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1024: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.loc[:, 'batch_rep'] = df['batch'] + " " + df['rep'] /home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1035: UserWarning: The palette list has more values (10) than needed (2), which may not be intended. g = sns.catplot(kind='box', data=df, y=y, x=x,height=height, aspect=aspect, hue=hue, palette=palette,
# plot_hm(df_dapi, split_by='rep', rows='cell_line', columns='panel', vmax=3)
# for batch in batches:
# print(batch)
# run_calc_hist_new(batch,cell_lines_for_disp, markers, root_directory_raw, root_directory_proc,
# hist_sample=10,sample_size_per_markers=10, ncols=4, nrows=1, figsize=(6,2))
# print("="*30)
# save notebook as HTML ( the HTML will be saved in the same folder the original script is)
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system(f'jupyter nbconvert --to html tools/preprocessing_tools/qc_reports/qc_report_funova.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/qc_report_funova.html')
[NbConvertApp] Converting notebook tools/preprocessing_tools/qc_reports/qc_report_funova.ipynb to html [NbConvertApp] Writing 7536693 bytes to /home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/manuscript/preprocessing_qc_reports/qc_report_funova.html
0